Introduction

In this report, we extract information about published JOSS papers and generate graphics as well as a summary table that can be downloaded and used for further analyses.

Load required R packages

suppressPackageStartupMessages({
  library(tibble)
  library(rcrossref)
  library(dplyr)
  library(tidyr)
  library(ggplot2)
  library(lubridate)
  library(gh)
  library(purrr)
  library(jsonlite)
  library(DT)
  library(plotly)
  library(citecorp)
  library(readr)
})
## Track which data source contributed each column of the final table
source_track <- c()

## Caption for the (non-interactive) plots: today's date when requested,
## otherwise an empty string
add_date_caption <- TRUE
dcap <- if (add_date_caption) lubridate::today() else ""
## Read archived version of summary data frame, to use for filling in 
## information about software repositories (due to limit on API requests)
## Sort by the date when software repo info was last obtained:
## rows with NA repo_info_obtained (never queried) sort first via the
## !is.na() key, then ascending by date, so the stalest entries get
## refreshed first in the GitHub queries below
papers_archive <- readRDS(gzcon(url("https://github.com/openjournals/joss-analytics/blob/gh-pages/joss_submission_analytics.rds?raw=true"))) %>%
  dplyr::arrange(!is.na(repo_info_obtained), repo_info_obtained)

## Similarly for citation analysis, to avoid having to pull down the 
## same information multiple times
## All columns are read as character (.default = "c") so no type guessing
## happens on the archived TSV
citations_archive <- readr::read_delim(
  url("https://github.com/openjournals/joss-analytics/blob/gh-pages/joss_submission_citations.tsv?raw=true"),
  col_types = cols(.default = "c"), col_names = TRUE,
  delim = "\t")

Collect information about papers

Pull down papers and citation info from Crossref

We get the information about published JOSS papers from Crossref, using the rcrossref R package. This package is also used to extract citation counts.

## Fetch JOSS papers from Crossref (ISSN 2475-9066)
## Only 1000 papers at a time can be pulled down
lim <- 1000
papers <- rcrossref::cr_works(filter = c(issn = "2475-9066"), 
                              limit = lim)$data
## Warning: `tbl_df()` is deprecated as of dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Page through the remaining results; a completely full result
## (nrow == i * lim) implies there may be more records to fetch
i <- 1
while (nrow(papers) == i * lim) {
  papers <- dplyr::bind_rows(
    papers, 
    rcrossref::cr_works(filter = c(issn = "2475-9066"), 
                        limit = lim, offset = i * lim)$data)
  i <- i + 1
}
## Keep only research articles (drops e.g. issue-level records)
papers <- papers %>%
  dplyr::filter(type == "journal-article") 

## A few papers don't have DOIs - generate them from the URL
noaltid <- which(is.na(papers$alternative.id))
papers$alternative.id[noaltid] <- gsub("http://dx.doi.org/", "",
                                       papers$url[noaltid])

## Get citation info from Crossref and merge with paper details
cit <- rcrossref::cr_citation_count(doi = papers$alternative.id)
papers <- papers %>% dplyr::left_join(
  cit %>% dplyr::rename(citation_count = count), 
  by = c("alternative.id" = "doi")
)

## Remove one duplicated paper
papers <- papers %>% dplyr::filter(alternative.id != "10.21105/joss.00688")

## All columns so far originate from Crossref
source_track <- c(source_track, 
                  structure(rep("crossref", ncol(papers)), 
                            names = colnames(papers)))

Pull down info from Whedon API

For each published paper, we use the Whedon API to get information about pre-review and review issue numbers, corresponding software repository etc.

## Page through the Whedon API until an empty page is returned
whedon <- list()
p <- 1
repeat {
  pg <- jsonlite::fromJSON(
    url(paste0("https://joss.theoj.org/papers/published.json?page=", p)),
    simplifyDataFrame = FALSE
  )
  if (length(pg) == 0) break
  whedon <- c(whedon, pg)
  p <- p + 1
}

## Flatten the Whedon records into one data.frame row per paper
whedon <- do.call(dplyr::bind_rows, lapply(whedon, function(w) {
  data.frame(api_title = w$title, 
             api_state = w$state,
             editor = paste(w$metadata$paper$editor, collapse = ","),
             reviewers = paste(w$reviewers, collapse = ","),
             nbr_reviewers = length(w$reviewers),
             repo_url = w$repository_url,
             review_issue_id = w$review_issue_id,
             doi = w$doi,
             ## scalar condition -> use if/else rather than ifelse()
             ## (ifelse() on a scalar is slower and strips attributes)
             prereview_issue_id = if (!is.null(w$meta_review_issue_id)) {
               w$meta_review_issue_id
             } else {
               NA_integer_
             },
             languages = paste(w$metadata$paper$languages, collapse = ","),
             archive_doi = w$metadata$paper$archive_doi)
}))

## Attach the Whedon fields to the Crossref table, matching on DOI
papers <- papers %>% dplyr::left_join(whedon, by = c("alternative.id" = "doi"))

## Columns added in this step come from the Whedon API
source_track <- c(source_track, 
                  structure(rep("whedon", length(setdiff(colnames(papers),
                                                         names(source_track)))), 
                            names = setdiff(colnames(papers), names(source_track))))

Combine with info from GitHub issues

From each pre-review and review issue, we extract information about review times and assigned labels.

## Pull down info on all issues (open and closed) in the joss-reviews repository
issues <- gh("/repos/openjournals/joss-reviews/issues", 
             .limit = 5000, state = "all")
## From each issue, extract required information
iss <- do.call(dplyr::bind_rows, lapply(issues, function(i) {
  data.frame(title = i$title, 
             number = i$number,
             state = i$state,
             opened = i$created_at,
             ## closed_at is absent (NULL) for open issues; scalar condition,
             ## so use if/else rather than ifelse()
             closed = if (!is.null(i$closed_at)) i$closed_at else NA_character_,
             ncomments = i$comments,
             ## drop the process labels, keep only the informative ones
             labels = paste(setdiff(
               vapply(i$labels, getElement, 
                      name = "name", character(1L)),
               c("review", "pre-review", "query-scope", "paused")),
               collapse = ","))
}))

## Split into REVIEW, PRE-REVIEW, and other issues (the latter category 
## is discarded)
## "Other" issues carry neither title tag; they are shown below for
## reference only and not used further
issother <- iss %>% dplyr::filter(!grepl("\\[PRE REVIEW\\]", title) & 
                                    !grepl("\\[REVIEW\\]", title))
dim(issother)
## [1] 27  7
head(issother)
##                                                                                                                   title
## 1 @TheoChristiaanse Thanks for your submission! A very quick initial comment is that was not straightforward for me to:
## 2                                                             @torressa @poulson I only found a couple of small issues:
## 3                                                                                     Request to regenerate final proof
## 4                                                                                 issues running example program Karate
## 5                                                                                                      @whedon commands
## 6                                                                                                               @whedon
##   number  state               opened               closed ncomments labels
## 1   2652 closed 2020-09-08T16:33:13Z 2020-09-08T16:48:16Z         3       
## 2   2082 closed 2020-02-07T09:51:50Z 2020-02-07T09:52:09Z         2       
## 3   2045 closed 2020-01-28T14:44:07Z 2020-01-28T14:45:26Z         2       
## 4   2015 closed 2020-01-15T13:25:37Z 2020-01-15T15:05:18Z         3       
## 5   1898 closed 2019-11-17T09:44:23Z 2019-11-17T10:26:41Z         4       
## 6   1897 closed 2019-11-17T09:43:49Z 2019-11-17T10:26:30Z         4
## For REVIEW issues, generate the DOI of the paper from the issue number
## Return the zeros needed to left-pad an issue number to 5 digits.
## Uses max(0L, ...) so that 6+-digit issue numbers yield "" instead of
## the error rep(0, <negative>) raised by the previous implementation.
getnbrzeros <- function(s) {
  strrep("0", max(0L, 5L - nchar(s)))
}
## Build the paper DOI (10.21105/joss.<5-digit issue number>) for each
## REVIEW issue, strip the title tag, and prefix all columns with "review_"
issrev <- iss %>%
  dplyr::filter(grepl("\\[REVIEW\\]", title)) %>%
  dplyr::mutate(
    alternative.id = paste0("10.21105/joss.",
                            purrr::map_chr(number, getnbrzeros),
                            number),
    title = gsub("\\[REVIEW\\]: ", "", title)
  ) %>%
  dplyr::rename_at(vars(-alternative.id), ~ paste0("review_", .))

## Tabulate the number of pre-review issues labeled 'rejected' per year
## (year taken from when the pre-review issue was opened)
iss %>% dplyr::filter(grepl("\\[PRE REVIEW\\]", title)) %>% 
  dplyr::filter(grepl("rejected", labels)) %>% 
  dplyr::mutate(year = lubridate::year(opened)) %>% 
  dplyr::group_by(year) %>% 
  dplyr::summarize(nbr_rejected = length(labels))
## # A tibble: 4 x 2
##    year nbr_rejected
##   <dbl>        <int>
## 1  2017            6
## 2  2018           16
## 3  2019           14
## 4  2020           87
## For PRE-REVIEW issues, add information about the corresponding REVIEW 
## issue number
## Drop submissions that were withdrawn or rejected at the pre-review stage
isspre <- iss %>%
  dplyr::filter(grepl("\\[PRE REVIEW\\]", title),
                !grepl("withdrawn", labels),
                !grepl("rejected", labels))
## Some titles have multiple pre-review issues. In these cases, keep the latest
## (highest issue number), then strip the title tag and prefix all columns
isspre <- isspre %>%
  dplyr::arrange(desc(number)) %>%
  dplyr::filter(!duplicated(title)) %>%
  dplyr::mutate(title = gsub("\\[PRE REVIEW\\]: ", "", title)) %>%
  dplyr::rename_all(~ paste0("prerev_", .))

## Join review and pre-review issue information onto the papers table.
## REVIEW issues match on the DOI derived from the issue number;
## PRE-REVIEW issues match on the prereview_issue_id from the Whedon API.
papers <- papers %>% dplyr::left_join(issrev, by = "alternative.id") %>% 
  dplyr::left_join(isspre, by = c("prereview_issue_id" = "prerev_number")) %>%
  dplyr::mutate(prerev_opened = as.Date(prerev_opened),
                prerev_closed = as.Date(prerev_closed),
                review_opened = as.Date(review_opened),
                review_closed = as.Date(review_closed)) %>% 
  ## durations are difftime objects (days); to_review indicates whether the
  ## submission made it out of pre-review into a review issue
  dplyr::mutate(days_in_pre = prerev_closed - prerev_opened,
                days_in_rev = review_closed - review_opened,
                to_review = !is.na(review_opened))

source_track <- c(source_track, 
                  structure(rep("joss-github", length(setdiff(colnames(papers),
                                                              names(source_track)))), 
                            names = setdiff(colnames(papers), names(source_track))))

Add information from software repositories

## Reorder so that software repositories that were interrogated longest 
## ago are checked first
## (repos not present in the archive sort first via na.last = FALSE; the
## archive itself is ordered stalest-first, see above)
tmporder <- order(match(papers$alternative.id, papers_archive$alternative.id),
                  na.last = FALSE)
software_urls <- papers$repo_url[tmporder]
## Only GitHub-hosted repositories can be queried via the gh API below
is_github <- grepl("github", software_urls)
length(is_github)
## [1] 1029
sum(is_github)
## [1] 984
## Non-GitHub hosts (Bitbucket, GitLab, ...) are listed for reference only
software_urls[!is_github]
##  [1] "https://bitbucket.org/cmutel/brightway2"                   
##  [2] "https://bitbucket.org/cloopsy/android/"                    
##  [3] "https://bitbucket.org/manuela_s/hcp/"                      
##  [4] "https://bitbucket.org/miketuri/perl-spice-sim-seus/"       
##  [5] "https://doi.org/10.17605/OSF.IO/3DS6A"                     
##  [6] "https://bitbucket.org/glotzer/rowan"                       
##  [7] "https://gitlab.com/moorepants/skijumpdesign"               
##  [8] "https://gitlab.com/toposens/public/ros-packages"           
##  [9] "https://gitlab.inria.fr/azais/treex"                       
## [10] "https://bitbucket.org/basicsums/basicsums"                 
## [11] "https://savannah.nongnu.org/projects/complot/"             
## [12] "http://mutabit.com/repos.fossil/grafoscopio/"              
## [13] "https://bitbucket.org/cardosan/brightway2-temporalis"      
## [14] "https://bitbucket.org/cdegroot/wediff"                     
## [15] "https://gitlab.com/materials-modeling/wulffpack"           
## [16] "https://gitlab.com/costrouc/pysrim"                        
## [17] "https://bitbucket.org/meg/cbcbeat"                         
## [18] "https://vcs.ynic.york.ac.uk/analysis/sails"                
## [19] "https://bitbucket.org/ocellarisproject/ocellaris"          
## [20] "https://gitlab.com/QComms/cqptoolkit"                      
## [21] "https://gitlab.com/dlr-dw/ontocode"                        
## [22] "https://gitlab.com/eidheim/Simple-Web-Server"              
## [23] "https://bitbucket.org/dghoshal/frieda"                     
## [24] "https://gitlab.com/tesch1/cppduals"                        
## [25] "https://gitlab.com/gdetor/genetic_alg"                     
## [26] "https://bitbucket.org/hammurabicode/hamx"                  
## [27] "https://gitlab.com/datafold-dev/datafold/"                 
## [28] "https://bitbucket.org/likask/mofem-cephas"                 
## [29] "https://www.idpoisson.fr/fullswof/"                        
## [30] "https://bitbucket.org/dolfin-adjoint/pyadjoint"            
## [31] "https://sourceforge.net/p/mcapl/mcapl_code/ci/master/tree/"
## [32] "https://gricad-gitlab.univ-grenoble-alpes.fr/ttk/spam/"    
## [33] "https://c4science.ch/source/tamaas/"                       
## [34] "https://gitlab.inria.fr/miet/miet"                         
## [35] "https://bitbucket.org/mpi4py/mpi4py-fft"                   
## [36] "https://gitlab.com/myqueue/myqueue"                        
## [37] "https://gitlab.com/cerfacs/batman"                         
## [38] "https://bitbucket.org/rram/dvrlib/src/joss/"               
## [39] "https://ts-gitlab.iup.uni-heidelberg.de/dorie/dorie"       
## [40] "https://gitlab.com/davidtourigny/dynamic-fba"              
## [41] "https://gitlab.com/celliern/scikit-fdiff/"                 
## [42] "https://gitlab.com/ampere2/metalwalls"                     
## [43] "https://ts-gitlab.iup.uni-heidelberg.de/utopia/utopia"     
## [44] "https://ts-gitlab.iup.uni-heidelberg.de/utopia/dantro"     
## [45] "https://gitlab.com/cosmograil/PyCS3"
## Query the GitHub API for each GitHub-hosted software repository.
## Each gh() call is wrapped in try() so that a single failing repository
## (deleted, renamed, rate-limited) does not abort the whole loop; failed
## lookups return NULL rows, which bind_rows() drops.
df <- do.call(dplyr::bind_rows, lapply(software_urls[is_github], function(u) {
  ## Normalize the URL: strip trailing slash and .git suffix, force https
  u0 <- gsub("^http://", "https://", gsub("\\.git$", "", gsub("/$", "", u)))
  ## Reduce links pointing inside a repo (tree/blob paths) to the repo root
  if (grepl("/tree/", u0)) {
    u0 <- strsplit(u0, "/tree/")[[1]][1]
  }
  if (grepl("/blob/", u0)) {
    u0 <- strsplit(u0, "/blob/")[[1]][1]
  }
  ## Three endpoints: repo metadata, per-language byte counts, contributors
  info <- try({
    gh(gsub("(https://)?(www.)?github.com/", "/repos/", u0))
  })
  languages <- try({
    gh(paste0(gsub("(https://)?(www.)?github.com/", "/repos/", u0), "/languages"), 
       .limit = 500)
  })
  contribs <- try({
    gh(paste0(gsub("(https://)?(www.)?github.com/", "/repos/", u0), "/contributors"), 
       .limit = 500)
  })
  if (!is(info, "try-error") && length(info) > 1) {
    if (!is(contribs, "try-error")) {
      if (length(contribs) == 0) {
        repo_nbr_contribs <- repo_nbr_contribs_2ormore <- NA_integer_
      } else {
        repo_nbr_contribs <- length(contribs)
        repo_nbr_contribs_2ormore <- sum(vapply(contribs, function(x) x$contributions >= 2, NA_integer_))
        ## Debug aid: print the record if the count unexpectedly comes out NA
        if (is.na(repo_nbr_contribs_2ormore)) {
          print(contribs)
        }
      }
    } else {
      repo_nbr_contribs <- repo_nbr_contribs_2ormore <- NA_integer_
    }
    
    ## Collapse the per-language byte counts to "lang:bytes,lang:bytes,..."
    ## (spaces removed from language names so the string splits cleanly later)
    if (!is(languages, "try-error")) {
      if (length(languages) == 0) {
        repolang <- ""
      } else {
        repolang <- paste(paste(gsub(" ", "", names(unlist(languages))), 
                                unlist(languages), sep = ":"), collapse = ",")
      }
    } else {
      repolang <- ""
    }
    data.frame(repo_url = u, 
               repo_created = info$created_at,
               repo_updated = info$updated_at,
               repo_pushed = info$pushed_at,
               repo_nbr_stars = info$stargazers_count,
               repo_language = ifelse(!is.null(info$language),
                                      info$language, NA_character_),
               repo_languages_bytes = repolang,
               repo_license = ifelse(!is.null(info$license),
                                     info$license$key, NA_character_),
               repo_nbr_contribs = repo_nbr_contribs,
               repo_nbr_contribs_2ormore = repo_nbr_contribs_2ormore
    )
  } else {
    NULL
  }
})) %>%
  dplyr::mutate(repo_created = as.Date(repo_created),
                repo_updated = as.Date(repo_updated),
                repo_pushed = as.Date(repo_pushed)) %>%
  dplyr::distinct() %>%
  dplyr::mutate(repo_info_obtained = lubridate::today())
## Each repo URL must be unique for the later left_join not to fan out
stopifnot(length(unique(df$repo_url)) == length(df$repo_url))
dim(df)

## For papers not in df (i.e., for which we didn't get a valid response
## from the GitHub API query), fall back to the archived data frame
archived_rows <- papers_archive %>%
  dplyr::select(intersect(colnames(df), colnames(papers_archive))) %>%
  dplyr::filter(!(repo_url %in% df$repo_url))
df <- dplyr::bind_rows(df, archived_rows)

papers <- papers %>% dplyr::left_join(df, by = "repo_url")

## Columns added in this step come from the software GitHub repositories
source_track <- c(source_track,
                  structure(rep("sw-github", length(setdiff(colnames(papers),
                                                            names(source_track)))),
                            names = setdiff(colnames(papers), names(source_track))))

Clean up a bit

## Convert publication date to Date format
## Add information about the half year (H1, H2) of publication
## Count number of authors
## reference/license/link are Crossref list-columns not needed downstream
papers <- papers %>% dplyr::select(-reference, -license, -link) %>%
  dplyr::mutate(published.date = as.Date(published.print)) %>% 
  dplyr::mutate(
    halfyear = paste0(year(published.date), 
                      ifelse(month(published.date) <= 6, "H1", "H2"))
  ) %>% dplyr::mutate(
    ## levels enumerate H1/H2 for every observed year, so empty half years
    ## are retained as factor levels for plotting
    halfyear = factor(halfyear, 
                      levels = paste0(rep(sort(unique(year(published.date))), 
                                          each = 2), c("H1", "H2")))
  ) %>% dplyr::mutate(nbr_authors = vapply(author, function(a) nrow(a), NA_integer_))

source_track <- c(source_track, 
                  structure(rep("cleanup", length(setdiff(colnames(papers),
                                                          names(source_track)))), 
                            names = setdiff(colnames(papers), names(source_track))))

Tabulate number of missing values

In some cases, fetching information from (e.g.) the GitHub API fails for a subset of the publications. There are also other reasons for missing values (for example, the earliest submissions do not have an associated pre-review issue). The table below lists the number of missing values for each of the variables in the data frame.

## Tabulate, per column, how many values are missing and which data source
## produced the column
nbr_na <- colSums(is.na(papers))
DT::datatable(
  data.frame(variable = colnames(papers),
             nbr_missing = nbr_na) %>%
    dplyr::mutate(source = source_track[variable]),
  escape = FALSE, rownames = FALSE,
  filter = list(position = 'top', clear = FALSE),
  options = list(scrollX = TRUE)
)

Number of published papers per month and year

## Aggregate publications by calendar month of publication, then plot
monthly_counts <- papers %>%
  dplyr::mutate(pubmonth = lubridate::floor_date(published.date, "month")) %>%
  dplyr::group_by(pubmonth) %>%
  dplyr::summarize(npub = n())
ggplot(monthly_counts, aes(x = factor(pubmonth), y = npub)) +
  geom_bar(stat = "identity") + theme_minimal() +
  labs(x = "", y = "Number of published papers per month", caption = dcap) +
  theme(axis.title = element_text(size = 15),
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

## Aggregate publications by calendar year of publication, then plot
yearly_counts <- papers %>%
  dplyr::mutate(pubyear = lubridate::year(published.date)) %>%
  dplyr::group_by(pubyear) %>%
  dplyr::summarize(npub = n())
ggplot(yearly_counts, aes(x = factor(pubyear), y = npub)) +
  geom_bar(stat = "identity") + theme_minimal() +
  labs(x = "", y = "Number of published papers per year", caption = dcap) +
  theme(axis.title = element_text(size = 15),
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

Citation distribution

Papers with 20 or more citations are grouped in the “>=20” category.

## Group all papers with >= 20 citations into a single ">=20" category.
## After the replacement no value "20" can remain, so the factor levels run
## 0:19 followed by ">=20" (the previous 0:20 levels contained an
## impossible "20" level)
ggplot(papers %>% 
         dplyr::mutate(citation_count = replace(citation_count,
                                                citation_count >= 20, ">=20")) %>%
         dplyr::mutate(citation_count = factor(citation_count, 
                                               levels = c(0:19, ">=20"))) %>%
         dplyr::group_by(citation_count) %>%
         dplyr::tally(),
       aes(x = citation_count, y = n)) + 
  geom_bar(stat = "identity") + 
  theme_minimal() + 
  labs(x = "Crossref citation count", y = "Number of publications", caption = dcap)

Most cited papers

The table below sorts the JOSS papers in decreasing order by the number of citations in Crossref.

## Render each paper URL as a clickable link and sort by citation count
top_cited <- papers %>%
  dplyr::mutate(url = paste0("<a href='", url, "' target='_blank'>",
                             url, "</a>")) %>%
  dplyr::arrange(desc(citation_count)) %>%
  dplyr::select(title, url, published.date, citation_count)
DT::datatable(
  top_cited,
  escape = FALSE,
  filter = list(position = 'top', clear = FALSE),
  options = list(scrollX = TRUE)
)

Citation count vs time since publication

## Interactive scatter: citation count (square-root scale) versus
## publication date with a smoothed trend; hover shows the paper title
plotly::ggplotly(
  ggplot(papers, aes(x = published.date, y = citation_count, label = title)) + 
    geom_point(alpha = 0.5) + theme_bw() + scale_y_sqrt() + 
    geom_smooth() + 
    labs(x = "Date of publication", y = "Crossref citation count", caption = dcap) + 
    theme(axis.title = element_text(size = 15)),
  tooltip = c("label", "x", "y")
)

Power law of citation count within each half year

Here, we plot the citation count for all papers published within each half year, sorted in decreasing order.

## Within each half year, rank papers by decreasing citation count and plot
## count against rank. Note: arrange() sorts the whole frame (it ignores
## grouping by default), but a global descending sort also orders rows
## within each group, so the per-group idx from seq_along() is a valid
## within-group rank.
ggplot(papers %>% dplyr::group_by(halfyear) %>% 
         dplyr::arrange(desc(citation_count)) %>%
         dplyr::mutate(idx = seq_along(citation_count)), 
       aes(x = idx, y = citation_count)) + 
  geom_point(alpha = 0.5) + 
  facet_wrap(~ halfyear, scales = "free") + 
  theme_bw() + 
  labs(x = "Index", y = "Crossref citation count", caption = dcap)

Pre-review/review time over time

In these plots we investigate whether the time a submission spends in the pre-review or review stage has changed over time.

## Days spent in pre-review, by date the pre-review issue was opened
ggplot(papers, aes(x = prerev_opened, y = as.numeric(days_in_pre))) +
  geom_point() +
  geom_smooth() +
  theme_bw() +
  theme(axis.title = element_text(size = 15)) +
  labs(x = "Date of pre-review opening", y = "Number of days in pre-review",
       caption = dcap)

## Days spent in review, by date the review issue was opened
ggplot(papers, aes(x = review_opened, y = as.numeric(days_in_rev))) +
  geom_point() +
  geom_smooth() +
  theme_bw() +
  theme(axis.title = element_text(size = 15)) +
  labs(x = "Date of review opening", y = "Number of days in review",
       caption = dcap)

Languages

Next, we consider the languages used by the submissions. Note that a given submission can use multiple languages.

## Count, for each language, the number of submissions using it.
## A submission can list several comma-separated languages.
sspl <- strsplit(papers$languages, ",")
## Drop the NA pseudo-language arising from papers with no Whedon
## language information (strsplit(NA) yields an NA element)
all_languages <- setdiff(unique(unlist(sspl)), NA)
langs <- do.call(dplyr::bind_rows, lapply(all_languages, function(l) {
  data.frame(language = l,
             ## FUN.VALUE logical(1): %in% returns logical, not numeric
             nbr_submissions = sum(vapply(sspl, function(v) l %in% v, logical(1))))
}))
langs %>% dplyr::arrange(desc(nbr_submissions))
##            language nbr_submissions
## 1               TeX             885
## 2            Python             603
## 3             Shell             278
## 4                 R             263
## 5          Makefile             206
## 6               C++             156
## 7  Jupyter Notebook             123
## 8              HTML             123
## 9                 C              81
## 10        Batchfile              68
## 11              CSS              66
## 12            CMake              59
## 13       JavaScript              55
## 14           Matlab              48
## 15            Julia              34
## 16       PowerShell              27
## 17          Fortran              24
## 18             Ruby              19
## 19             Java              19
## 20             Perl              12
## 21            Rebol              11
## 22               M4               7
## 23               Go               7
## 24             GLSL               7
## 25               C#               6
## 26             Roff               6
## 27                M               6
## 28             Cuda               5
## 29      Mathematica               5
## 30             Rust               5
## 31           Prolog               4
## 32              PHP               4
## 33           Smarty               4
## 34              IDL               3
## 35      Objective-C               3
## 36            Scala               3
## 37           Groovy               3
## 38            QMake               3
## 39              Awk               2
## 40             NSIS               2
## 41        Smalltalk               2
## 42         Assembly               2
## 43             Stan               2
## 44             XSLT               2
## 45         IGOR Pro               2
## 46              Vue               2
## 47            OCaml               2
## 48              Tcl               2
## 49              GAP               2
## 50              Lua               2
## 51       AGS Script               1
## 52            PLSQL               1
## 53      Common Lisp               1
## 54               eC               1
## 55            Lasso               1
## 56          Gnuplot               1
## 57            Stata               1
## 58              wdl               1
## 59           Kotlin               1
## 60       Inno Setup               1
## 61            Logos               1
## 62             Yacc               1
## 63     CoffeeScript               1
## 64               F#               1
## 65           XQuery               1
## 66                D               1
## 67             Golo               1
## 68           Scheme               1
## 69           Puppet               1
## 70            HyPhy               1
## 71       FreeMarker               1
## 72     UnrealScript               1
## 73            ANTLR               1
## 74       SourcePawn               1
## 75              Max               1
## 76              QML               1
## 77           JSONiq               1
## 78       TypeScript               1
## 79             Mako               1
## 80       Emacs Lisp               1
## 81             Hack               1
## 82     OpenEdge ABL               1
## Bar plot of submissions per language, ordered by decreasing frequency
lang_order <- langs %>%
  dplyr::arrange(desc(nbr_submissions)) %>%
  dplyr::mutate(language = factor(language, levels = language))
ggplot(lang_order, aes(x = language, y = nbr_submissions)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
        axis.title = element_text(size = 15)) +
  labs(x = "", y = "Number of submissions", caption = dcap)

We’ll also list the languages ordered by the number of bytes of code written in each (based on available GitHub repositories), and plot the number of repositories using a given language vs the total number of bytes written in the language.

## Parse the "lang1:bytes1,lang2:bytes2,..." strings into per-language
## byte totals across all GitHub repositories
a <- lapply(strsplit(papers$repo_languages_bytes, ","), function(w) strsplit(w, ":"))
## lengths() is the vectorized, type-stable replacement for sapply(a, length)
a <- a[lengths(a) > 0]
langbytes <- as.data.frame(t(as.data.frame(a))) %>% 
  setNames(c("language", "bytes")) %>%
  dplyr::mutate(bytes = as.numeric(bytes)) %>%
  dplyr::filter(!is.na(language)) %>%
  dplyr::group_by(language) %>%
  dplyr::summarize(n_bytes = sum(bytes),
                   n_repos = length(bytes)) %>%
  dplyr::arrange(desc(n_bytes))
## Interactive table of languages ordered by total bytes of code
DT::datatable(
  langbytes,
  escape = FALSE,
  filter = list(position = 'top', clear = FALSE),
  options = list(scrollX = TRUE)
)
## Repos using a language vs total bytes written in it (log-log scale)
ggplot(langbytes, aes(x = n_repos, y = n_bytes)) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10() +
  geom_smooth() +
  theme_bw() +
  theme(axis.title = element_text(size = 15)) +
  labs(x = "Number of repos using the language",
       y = "Total number of bytes of code\nwritten in the language",
       caption = dcap)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Association between number of citations and number of stars of the GitHub repo

## Interactive scatter of citation count vs GitHub stars (sqrt scales);
## hover shows the paper title
ggplotly(
  ggplot(papers, aes(x = citation_count, y = repo_nbr_stars, label = title)) +
    geom_point(alpha = 0.5) +
    scale_x_sqrt() +
    scale_y_sqrt() +
    theme_bw() +
    theme(axis.title = element_text(size = 15)) +
    labs(x = "Crossref citation count", y = "Number of stars, GitHub repo",
         caption = dcap),
  tooltip = c("label", "x", "y")
)

Distribution of time between GitHub repo creation and JOSS submission

## Histogram of days between repo creation and pre-review opening
ggplot(papers, aes(x = as.numeric(prerev_opened - repo_created))) +
  geom_histogram(bins = 50) +
  theme_bw() +
  theme(axis.title = element_text(size = 15)) +
  labs(x = "Time (days) from repo creation to JOSS pre-review start",
       caption = dcap)

Distribution of time between JOSS acceptance and last commit

## Histogram of days between review closure and the most recent push,
## faceted by publication year
ggplot(papers, aes(x = as.numeric(repo_pushed - review_closed))) +
  geom_histogram(bins = 50) +
  theme_bw() +
  facet_wrap(~ year(published.date), scales = "free_y") +
  theme(axis.title = element_text(size = 15)) +
  labs(x = "Time (days) from closure of JOSS review to most recent commit in repo",
       caption = dcap)

Number of authors per paper

List the papers with the largest number of authors, and display the distribution of the number of authors per paper, for papers with at most 20 authors.

## Papers with largest number of authors (top 10 shown)
papers %>% dplyr::arrange(desc(nbr_authors)) %>% 
  dplyr::select(title, published.date, url, nbr_authors) %>%
  as.data.frame() %>% head(10)
##                                                                                   title
## 1                                             SunPy: A Python package for Solar Physics
## 2                 ENZO: An Adaptive Mesh Refinement Code for Astrophysics (Version 2.6)
## 3                                                PyBIDS: Python tools for BIDS datasets
## 4                                     Chaste: Cancer, Heart and Soft Tissue Environment
## 5                                    spam: Software for Practical Analysis of Materials
## 6                                                 VIVO: a system for research discovery
## 7                                                              Welcome to the Tidyverse
## 8  Pyglmnet: Python implementation of elastic-net regularized generalized linear models
## 9                        HPX - The C++ Standard Library for Parallelism and Concurrency
## 10                     AMReX: a framework for block-structured adaptive mesh refinement
##    published.date                                   url nbr_authors
## 1      2020-02-14 http://dx.doi.org/10.21105/joss.01832         124
## 2      2019-10-03 http://dx.doi.org/10.21105/joss.01636          55
## 3      2019-08-12 http://dx.doi.org/10.21105/joss.01294          31
## 4      2020-03-13 http://dx.doi.org/10.21105/joss.01848          29
## 5      2020-07-13 http://dx.doi.org/10.21105/joss.02286          27
## 6      2019-07-26 http://dx.doi.org/10.21105/joss.01182          25
## 7      2019-11-21 http://dx.doi.org/10.21105/joss.01686          24
## 8      2020-03-01 http://dx.doi.org/10.21105/joss.01959          22
## 9      2020-09-02 http://dx.doi.org/10.21105/joss.02352          19
## 10     2019-05-12 http://dx.doi.org/10.21105/joss.01370          17
## Histogram of the number of authors for papers with at most 20 authors,
## one bin per observed author count.
## na.rm = TRUE guards against papers with an unknown author count:
## without it an NA in nbr_authors would make nbins NA and break
## geom_histogram() (dplyr::filter already drops those NA rows from the data)
nbins <- max(papers$nbr_authors[papers$nbr_authors <= 20], na.rm = TRUE)
ggplot(papers %>% dplyr::filter(nbr_authors <= 20),
  aes(x = nbr_authors)) + 
  geom_histogram(bins = nbins, fill = "lightgrey", color = "grey50") + 
  theme_bw() + 
  facet_wrap(~ year(published.date), scales = "free_y") + 
  theme(axis.title = element_text(size = 15)) + 
  labs(x = "Number of authors",
       y = "Number of publications with\na given number of authors", 
       caption = dcap)

## Stacked area plot: fraction of submissions per year with a given number
## of authors (papers with more than five authors collapsed into ">5").
## .drop = FALSE keeps year/author-count combinations with zero papers so
## the area chart has a complete grid.
ggplot(papers %>% 
         dplyr::mutate(nbr_authors = replace(nbr_authors, nbr_authors > 5, ">5")) %>%
         dplyr::mutate(nbr_authors = factor(nbr_authors, levels = c("1", "2", "3", 
                                                                    "4", "5", ">5"))) %>%
         dplyr::mutate(year = year(published.date)) %>%
         dplyr::mutate(year = factor(year)) %>%
         dplyr::group_by(year, nbr_authors, .drop = FALSE) %>%
         dplyr::summarize(n = n()) %>%
         ## after summarize() the result is still grouped by year, so freq
         ## is the within-year fraction
         dplyr::mutate(freq = n/sum(n)) %>%
         dplyr::mutate(year = as.integer(as.character(year))), 
       aes(x = year, y = freq, fill = nbr_authors)) + geom_area() + 
  theme_minimal() + 
  scale_fill_brewer(palette = "Set1", name = "Number of\nauthors", 
                    na.value = "grey") + 
  theme(axis.title = element_text(size = 15)) + 
  labs(x = "Year", y = "Fraction of submissions", caption = dcap)

Number of authors vs number of contributors to the GitHub repo

Note that points are slightly jittered to reduce the overlap.

## Interactive scatter plot: number of paper authors vs number of repo
## contributors with at least two commits. Points are slightly
## jittered to reduce overplotting; the diagonal marks equality.
p_contribs <- ggplot(papers,
                     aes(x = nbr_authors, y = repo_nbr_contribs_2ormore,
                         label = title)) +
  geom_abline(slope = 1, intercept = 0) +
  geom_jitter(width = 0.05, height = 0.05, alpha = 0.5) +
  theme_bw() +
  scale_x_sqrt() +
  scale_y_sqrt() +
  labs(x = "Number of authors",
       y = "Number of contributors\nwith at least 2 commits",
       caption = dcap) +
  theme(axis.title = element_text(size = 15))
plotly::ggplotly(p_contribs, tooltip = c("label", "x", "y"))

Number of reviewers per paper

Submissions associated with rOpenSci and pyOpenSci are not considered here, since they are not explicitly reviewed at JOSS.

## Bar chart of the number of reviewers per submission, by year.
## rOpenSci/pyOpenSci submissions are excluded since JOSS does not
## review them itself.
papers %>%
  dplyr::filter(!grepl("rOpenSci|pyOpenSci", prerev_labels)) %>%
  dplyr::mutate(year = year(published.date)) %>%
  ggplot(aes(x = nbr_reviewers)) +
  geom_bar() +
  facet_wrap(~ year) +
  theme_bw() +
  labs(x = "Number of reviewers", y = "Number of submissions", caption = dcap)

Most active reviewers

Submissions associated with rOpenSci and pyOpenSci are not considered here, since they are not explicitly reviewed at JOSS.

## One row per (reviewer, publication year) pair. Submissions handled
## by rOpenSci/pyOpenSci are excluded as they are not reviewed at JOSS.
## NOTE(review): separate_rows splits on a bare ",", which assumes the
## reviewer lists contain no spaces after the commas — confirm upstream.
reviewers <- papers %>%
  dplyr::filter(!grepl("rOpenSci|pyOpenSci", prerev_labels)) %>%
  dplyr::mutate(year = year(published.date)) %>%
  dplyr::select(reviewers, year) %>%
  tidyr::separate_rows(reviewers, sep = ",")

## Rank reviewers by number of completed reviews, with the span of
## years over which they were active.
reviewer_stats <- reviewers %>%
  dplyr::group_by(reviewers) %>%
  dplyr::summarize(
    nbr_reviews = length(year),
    timespan = paste(unique(c(min(year), max(year))), collapse = " - ")
  ) %>%
  dplyr::arrange(desc(nbr_reviews))
DT::datatable(
  reviewer_stats,
  escape = FALSE, rownames = FALSE,
  filter = list(position = 'top', clear = FALSE),
  options = list(scrollX = TRUE)
)

Number of papers per editor and year

## Submissions per editor, stratified by year; the fill color
## distinguishes r/pyOpenSci-affiliated submissions from regular ones.
editor_data <- papers %>%
  dplyr::mutate(
    year = year(published.date),
    `r/pyOpenSci` = factor(grepl("rOpenSci|pyOpenSci", prerev_labels),
                           levels = c("TRUE", "FALSE"))
  )
ggplot(editor_data, aes(x = editor)) +
  geom_bar(aes(fill = `r/pyOpenSci`)) +
  facet_wrap(~ year, ncol = 1) +
  theme_bw() +
  scale_fill_manual(values = c(`TRUE` = "grey65", `FALSE` = "grey35")) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  labs(x = "Editor", y = "Number of submissions", caption = dcap)

Distribution of software repo licenses

## Order the license factor levels so that related license families
## (Apache, BSD, MIT, GPL, MPL) appear grouped together, followed by
## all remaining licenses in alphabetical order.
## (Fixed: use `<-` rather than `=` for top-level assignment, and
## collapse the repeated grep calls into a single lapply.)
all_licenses <- sort(unique(papers$repo_license))
license_levels <- unlist(lapply(
  c("apache", "bsd", "mit", "gpl", "mpl"),
  function(pattern) grep(pattern, all_licenses, value = TRUE)
))
license_levels <- c(license_levels, setdiff(all_licenses, license_levels))
## Number of submissions per license and year
ggplot(papers %>% 
         dplyr::mutate(repo_license = factor(repo_license, 
                                             levels = license_levels)),
       aes(x = repo_license)) +
  geom_bar() + 
  theme_bw() + 
  labs(x = "Software license", y = "Number of submissions", caption = dcap) + 
  theme(axis.title = element_text(size = 15),
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + 
  facet_wrap(~ year(published.date), scales = "free_y")

## For the plots below, licenses used by at most 2.5% of the
## submissions are collapsed into an 'other' category.
## NOTE(review): 'other' is normally not among license_levels, so it
## maps to NA in the factor and is drawn with na.value (grey) — confirm
## this is the intended rendering.
tbl <- table(papers$repo_license)
to_replace <- names(tbl[tbl <= 0.025 * nrow(papers)])
license_counts <- papers %>%
  dplyr::mutate(
    year = factor(year(published.date)),
    repo_license = replace(repo_license, repo_license %in% to_replace,
                           "other"),
    repo_license = factor(
      repo_license,
      levels = license_levels[license_levels %in% repo_license]
    )
  ) %>%
  dplyr::group_by(year, repo_license, .drop = FALSE) %>%
  dplyr::count() %>%
  dplyr::mutate(year = as.integer(as.character(year)))
## Stacked area chart: number of submissions per license family and year
ggplot(license_counts, aes(x = year, y = n, fill = repo_license)) +
  geom_area() +
  theme_minimal() +
  scale_fill_brewer(palette = "Set1", name = "Software\nlicense",
                    na.value = "grey") +
  theme(axis.title = element_text(size = 15)) +
  labs(x = "Year", y = "Number of submissions", caption = dcap)

## Same breakdown as above, but expressed as the fraction of
## submissions per year instead of absolute counts.
license_fractions <- papers %>%
  dplyr::mutate(
    year = factor(year(published.date)),
    repo_license = replace(repo_license, repo_license %in% to_replace,
                           "other"),
    repo_license = factor(
      repo_license,
      levels = license_levels[license_levels %in% repo_license]
    )
  ) %>%
  dplyr::group_by(year, repo_license, .drop = FALSE) %>%
  dplyr::summarize(n = n()) %>%
  dplyr::mutate(freq = n/sum(n)) %>%
  dplyr::mutate(year = as.integer(as.character(year)))
ggplot(license_fractions, aes(x = year, y = freq, fill = repo_license)) +
  geom_area() +
  theme_minimal() +
  scale_fill_brewer(palette = "Set1", name = "Software\nlicense",
                    na.value = "grey") +
  theme(axis.title = element_text(size = 15)) +
  labs(x = "Year", y = "Fraction of submissions", caption = dcap)

Citation analysis [work in progress]

Here, we take a more detailed look at the papers that cite JOSS papers, using data from the Open Citations Corpus.

Get citing papers for each submission

## Get the papers citing each JOSS submission from the Open Citations
## Corpus, and keep only citations not already present in the archive.
citations <- citecorp::oc_coci_cites(doi = papers$alternative.id) %>%
  dplyr::distinct()
dim(citations)
## [1] 3261    7
citations <- citations %>% 
  dplyr::filter(!(oci %in% citations_archive$oci))
dim(citations)
## [1] 3122    7
## temporary - only process (at most) the first 1000 new citations per
## run, to limit the number of Crossref API requests below.
## head() is used instead of citations[1:1000, ], which would create
## NA-filled rows whenever fewer than 1000 new citations are available.
citations <- head(citations, 1000)

## Look up journal/publisher metadata for the citing DOIs via Crossref
## and attach it to the citation table.
tmpj <- rcrossref::cr_works(dois = unique(citations$citing))$data %>%
      dplyr::select(contains("doi"), contains("container.title"), contains("issn"),
                    contains("type"), contains("publisher"), contains("prefix"))
citations <- citations %>% dplyr::left_join(tmpj, by = c("citing" = "doi"))
dim(citations)
## [1] 1000   12
## bioRxiv preprints don't have a 'container.title' or 'issn', but we'll assume 
## that they can be 
## identified from the prefix 10.1101 - set the container.title 
## for these records manually; we may or may not want to count these
## (would it count citations twice, both preprint and publication?)
## which() makes the subassignment NA-safe: DOIs missing from the
## Crossref lookup leave prefix/container.title as NA after the join,
## and a logical subscript containing NA would make `[<-` error.
citations$container.title[which(citations$prefix == "10.1101")] <- "bioRxiv"

## JOSS is represented by 'The Journal of Open Source Software' as well as 
## 'Journal of Open Source Software' - unify the two spellings
citations$container.title[which(citations$container.title == 
                                  "Journal of Open Source Software")] <- 
  "The Journal of Open Source Software"

## Remove real self citations (cited DOI = citing DOI)
citations <- citations %>% dplyr::filter(cited != citing)

## Merge with the archive, and persist the combined table for the next run
citations <- dplyr::bind_rows(citations, citations_archive)

write.table(citations, file = "joss_submission_citations.tsv",
            row.names = FALSE, col.names = TRUE, sep = "\t", quote = FALSE)

Summary statistics

## Number of JOSS papers with >0 citations included in this collection
length(unique(citations$cited))
## [1] 189
## Number of JOSS papers with >0 citations according to Crossref
length(which(papers$citation_count > 0))
## [1] 566
## Number of citations from Open Citations Corpus vs Crossref.
## The NA -> 0 replacement must happen after the full join: tally()
## itself never produces NA counts, but the join introduces NAs in 'n'
## for papers without any Open Citations Corpus record. (In the
## original code the replace() sat inside the tally pipe, where it was
## a no-op.)
df0 <- papers %>% dplyr::select(doi, citation_count) %>%
  dplyr::full_join(citations %>% dplyr::group_by(cited) %>%
                     dplyr::tally(),
                   by = c("doi" = "cited")) %>%
  dplyr::mutate(n = replace(n, is.na(n), 0))
## Total citation count Crossref
sum(df0$citation_count, na.rm = TRUE)
## [1] 4988
## Total citation count Open Citations Corpus
sum(df0$n, na.rm = TRUE)
## [1] 1144
## Ratio of total citation count Open Citations Corpus/Crossref
sum(df0$n, na.rm = TRUE)/sum(df0$citation_count, na.rm = TRUE)
## [1] 0.2293504
ggplot(df0, aes(x = citation_count, y = n)) + 
  geom_abline(slope = 1, intercept = 0) + 
  geom_point(size = 3, alpha = 0.5) + 
  labs(x = "Crossref citation count", y = "Open Citations Corpus citation count",
       caption = dcap) + 
  theme_bw()

## Zoom in
ggplot(df0, aes(x = citation_count, y = n)) + 
  geom_abline(slope = 1, intercept = 0) + 
  geom_point(size = 3, alpha = 0.5) + 
  labs(x = "Crossref citation count", y = "Open Citations Corpus citation count",
       caption = dcap) + 
  theme_bw() + 
  coord_cartesian(xlim = c(0, 75), ylim = c(0, 75))

## Number of journals citing JOSS papers
length(unique(citations$container.title))
## [1] 426
length(unique(citations$issn))
## [1] 356

Most citing journals

## Summarize citations by citing journal: total citations of JOSS
## papers, distinct cited JOSS papers, distinct citing papers, and the
## number/fraction of author self-citations.
## na.rm = TRUE guards against missing author_sc values, which can
## occur after bind_rows if a record (e.g. from the archive) lacks the
## column; without it a single NA would make the whole group count NA.
topcit <- citations %>% dplyr::group_by(container.title) %>%
  dplyr::summarize(nbr_citations_of_joss_papers = length(cited),
                   nbr_cited_joss_papers = length(unique(cited)),
                   nbr_citing_papers = length(unique(citing)),
                   nbr_selfcitations_of_joss_papers = sum(author_sc == "yes",
                                                          na.rm = TRUE),
                   fraction_selfcitations = signif(nbr_selfcitations_of_joss_papers /
                     nbr_citations_of_joss_papers, digits = 3)) %>%
  dplyr::arrange(desc(nbr_cited_joss_papers))
DT::datatable(topcit,
  escape = FALSE, rownames = FALSE, 
  filter = list(position = 'top', clear = FALSE),
  options = list(scrollX = TRUE))
## Interactive view: citations of JOSS papers vs distinct cited JOSS
## papers, one point per journal
plotly::ggplotly(
  ggplot(topcit, aes(x = nbr_citations_of_joss_papers, y = nbr_cited_joss_papers,
                     label = container.title)) + 
    geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey") + 
    geom_point(size = 3, alpha = 0.5) + 
    theme_bw() + 
    labs(caption = dcap, x = "Number of citations of JOSS papers",
         y = "Number of cited JOSS papers")
)
## Same plot, zoomed in on the lower-left region
plotly::ggplotly(
  ggplot(topcit, aes(x = nbr_citations_of_joss_papers, y = nbr_cited_joss_papers,
                     label = container.title)) + 
    geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey") + 
    geom_point(size = 3, alpha = 0.5) + 
    theme_bw() + 
    coord_cartesian(xlim = c(0, 100), ylim = c(0, 50)) + 
    labs(caption = dcap, x = "Number of citations of JOSS papers",
         y = "Number of cited JOSS papers")
)
write.table(topcit, file = "joss_submission_citations_byjournal.tsv",
            row.names = FALSE, col.names = TRUE, sep = "\t", quote = FALSE)

Save object

The tibble object with all data collected above is serialized to a file that can be downloaded and reused.

## Preview the first rows of the final summary table
head(papers) %>% as.data.frame()
##        alternative.id                     container.title    created  deposited
## 1 10.21105/joss.00900     Journal of Open Source Software 2018-09-23 2018-09-23
## 2 10.21105/joss.00236 The Journal of Open Source Software 2017-04-19 2019-09-21
## 3 10.21105/joss.02581     Journal of Open Source Software 2020-08-26 2020-08-26
## 4 10.21105/joss.02520     Journal of Open Source Software 2020-08-26 2020-08-26
## 5 10.21105/joss.01423     Journal of Open Source Software 2019-05-08 2019-11-17
## 6 10.21105/joss.01614     Journal of Open Source Software 2019-08-20 2019-11-17
##   published.print                 doi    indexed      issn issue     issued
## 1      2018-09-23 10.21105/joss.00900 2020-03-10 2475-9066    29 2018-09-23
## 2      2017-04-19 10.21105/joss.00236 2020-08-26 2475-9066    12 2017-04-19
## 3      2020-08-26 10.21105/joss.02581 2020-08-26 2475-9066    52 2020-08-26
## 4      2020-08-26 10.21105/joss.02520 2020-08-26 2475-9066    52 2020-08-26
## 5      2019-05-08 10.21105/joss.01423 2020-04-07 2475-9066    37 2019-05-08
## 6      2019-08-20 10.21105/joss.01614 2020-02-14 2475-9066    40 2019-08-20
##   member page   prefix        publisher reference.count score   source
## 1   8722  900 10.21105 The Open Journal               9     1 Crossref
## 2   8722  236 10.21105 The Open Journal              12     1 Crossref
## 3   8722 2581 10.21105 The Open Journal              10     1 Crossref
## 4   8722 2520 10.21105 The Open Journal              11     1 Crossref
## 5   8722 1423 10.21105 The Open Journal               9     1 Crossref
## 6   8722 1614 10.21105 The Open Journal               7     1 Crossref
##                                                                                                                            title
## 1                                                                                      GB code: A grain boundary generation code
## 2                                                                  Brightway: An open source framework for Life Cycle Assessment
## 3 SALSA: A Python Package for Constructing Synthetic Quasar Absorption Line Catalogs from Astrophysical Hydrodynamic Simulations
## 4                                                                          Gridap: An extensible Finite Element toolbox in Julia
## 5                                                                                    CRED: a rapid peak caller for Chem-seq data
## 6                                                                          drms: A Python package for accessing HMI and AIA data
##              type                                   url volume
## 1 journal-article http://dx.doi.org/10.21105/joss.00900      3
## 2 journal-article http://dx.doi.org/10.21105/joss.00236      2
## 3 journal-article http://dx.doi.org/10.21105/joss.02581      5
## 4 journal-article http://dx.doi.org/10.21105/joss.02520      5
## 5 journal-article http://dx.doi.org/10.21105/joss.01423      4
## 6 journal-article http://dx.doi.org/10.21105/joss.01614      4
##                                                                                                                                                                                                                                                                                                                                                                        author
## 1                                                                                                                                                                                                  http://orcid.org/0000-0002-9616-4602, http://orcid.org/0000-0003-4281-5665, NA, FALSE, FALSE, NA, R., B., J., Hadian, Grabowski, Neugebauer, first, additional, additional
## 2                                                                                                                                                                                                                                                                                                            http://orcid.org/0000-0002-7898-9862, FALSE, Chris, Mutel, first
## 3                                                                                                                       http://orcid.org/0000-0003-0872-7098, NA, NA, NA, NA, NA, FALSE, NA, NA, NA, NA, NA, Brendan, Devin, Brian, Jason, Molly, Nicholas, Boyd, Silvia, O’Shea, Tumlinson, Peeples, Earl, first, additional, additional, additional, additional, additional
## 4                                                                                                                                                                                                                             http://orcid.org/0000-0003-2391-4086, http://orcid.org/0000-0003-3667-443X, FALSE, FALSE, Santiago, Francesc, Badia, Verdugo, first, additional
## 5                                                                                          http://orcid.org/0000-0002-8086-3185, http://orcid.org/0000-0003-2358-7919, http://orcid.org/0000-0002-0916-7339, http://orcid.org/0000-0002-3992-5399, FALSE, FALSE, FALSE, FALSE, Jason, Tony, Paul, Hiroki, Lin, Kuo, Horton, Nagase, first, additional, additional, additional
## 6 http://orcid.org/0000-0002-1361-5712, http://orcid.org/0000-0002-5662-9604, http://orcid.org/0000-0001-6915-4583, http://orcid.org/0000-0002-0361-6463, http://orcid.org/0000-0003-4217-4642, FALSE, FALSE, FALSE, FALSE, FALSE, Kolja, Monica, Nitin, Arthur, Stuart, Glogowski, Bobra, Choudhary, Amezcua, Mumford, first, additional, additional, additional, additional
##   citation_count
## 1              1
## 2             39
## 3              0
## 4              0
## 5              1
## 6              2
##                                                                                                                        api_title
## 1                                                                                      GB code: A grain boundary generation code
## 2                                                                  Brightway: An open source framework for Life Cycle Assessment
## 3 SALSA: A Python Package for Constructing Synthetic Quasar Absorption Line Catalogs from Astrophysical Hydrodynamic Simulations
## 4                                                                          Gridap: An extensible Finite Element toolbox in Julia
## 5                                                                                    CRED: a rapid peak caller for Chem-seq data
## 6                                                                          drms: A Python package for accessing HMI and AIA data
##   api_state                  editor                      reviewers
## 1  accepted                @labarba               @vyasr,@trallard
## 2  accepted               @katyhuff                        @amoeba
## 3  accepted            @danielskatz                @olebole,@zpace
## 4  accepted @Kevin-Mattheus-Moerman @PetrKryslUCSD,@TeroFrondelius
## 5  accepted               @lpantano                       @darogan
## 6  accepted                 @xuanxu       @mgckind,@aureliocarnero
##   nbr_reviewers                                repo_url review_issue_id
## 1             2    https://github.com/oekosheri/GB_code             900
## 2             1 https://bitbucket.org/cmutel/brightway2             236
## 3             2         https://github.com/biboyd/SALSA            2581
## 4             2     https://github.com/gridap/Gridap.jl            2520
## 5             1        https://github.com/jlincbio/cred            1423
## 6             2           https://github.com/sunpy/drms            1614
##   prereview_issue_id                              languages
## 1                853                             Python,TeX
## 2                228 Python,Shell,Jupyter Notebook,HTML,TeX
## 3               2532      Jupyter Notebook,TeX,Shell,Python
## 4               2464                        Julia,Shell,TeX
## 5               1374                    Makefile,Perl,C,TeX
## 6               1559                             Python,TeX
##                               archive_doi
## 1  https://doi.org/10.5281/zenodo.1433530
## 2 http://dx.doi.org/10.5281/zenodo.556145
## 3  https://doi.org/10.5281/zenodo.4002067
## 4  https://doi.org/10.5281/zenodo.3999839
## 5  https://doi.org/10.5281/zenodo.2667613
## 6  https://doi.org/10.5281/zenodo.3369966
##                                                                                                                     review_title
## 1                                                                                      GB_code: A grain boundary generation code
## 2                                                                  Brightway: An open source framework for life cycle assessment
## 3 SALSA: A Python Package for Constructing Synthetic Quasar Absorption Line Catalogs from Astrophysical Hydrodynamic Simulations
## 4                                                                          Gridap: An extensible Finite Element toolbox in Julia
## 5                                                                                    CRED: a rapid peak caller for Chem-seq data
## 6                                                                          drms: A Python package for accessing HMI and AIA data
##   review_number review_state review_opened review_closed review_ncomments
## 1           900       closed    2018-08-17    2018-09-23               90
## 2           236       closed    2017-04-13    2017-04-19               18
## 3          2581       closed    2020-08-18    2020-08-26               38
## 4          2520       closed    2020-07-26    2020-08-26               74
## 5          1423       closed    2019-05-01    2019-05-08               93
## 6          1614       closed    2019-08-01    2019-08-20               62
##                                                    review_labels
## 1                            accepted,published,recommend-accept
## 2                            accepted,published,recommend-accept
## 3 Jupyter Notebook,Shell,TeX,accepted,published,recommend-accept
## 4                  Julia,TeX,accepted,published,recommend-accept
## 5                            accepted,published,recommend-accept
## 6                            accepted,published,recommend-accept
##                                                                                                                     prerev_title
## 1                                                                                      GB_code: A grain boundary generation code
## 2                                                                  Brightway: An open source framework for life cycle assessment
## 3 SALSA: A Python Package for Constructing Synthetic Quasar Absorption Line Catalogs from Astrophysical Hydrodynamic Simulations
## 4                                                                          Gridap: An extensible Finite Element toolbox in Julia
## 5                                                                                    CRED: a rapid peak caller for Chem-seq data
## 6                                                                          drms: A Python package for accessing HMI and AIA data
##   prerev_state prerev_opened prerev_closed prerev_ncomments
## 1       closed    2018-07-26    2018-08-17               42
## 2       closed    2017-04-06    2017-04-13               14
## 3       closed    2020-07-28    2020-08-18               44
## 4       closed    2020-07-10    2020-07-26               47
## 5       closed    2019-04-14    2019-05-01               30
## 6       closed    2019-07-11    2019-08-01               29
##                prerev_labels days_in_pre days_in_rev to_review repo_created
## 1                 Python,TeX     22 days     37 days      TRUE   2018-07-12
## 2                                 7 days      6 days      TRUE         <NA>
## 3 Jupyter Notebook,Shell,TeX     21 days      8 days      TRUE   2020-06-11
## 4                  Julia,TeX     16 days     31 days      TRUE   2019-03-15
## 5            C,Makefile,Perl     17 days      7 days      TRUE   2019-04-10
## 6                     Python     21 days     19 days      TRUE   2016-05-12
##   repo_updated repo_pushed repo_nbr_stars repo_language
## 1   2020-09-23  2019-08-08             16        Python
## 2         <NA>        <NA>             NA          <NA>
## 3   2020-08-27  2020-08-27              2        Python
## 4   2020-09-21  2020-09-24            112         Julia
## 5   2020-02-26  2020-02-26              1             C
## 6   2020-04-16  2020-04-16             11        Python
##                                    repo_languages_bytes repo_license
## 1                                 Python:47666,TeX:2712          mit
## 2                                                  <NA>         <NA>
## 3 Python:97186,JupyterNotebook:21308,TeX:8559,Shell:546 bsd-3-clause
## 4                                         Julia:1093861          mit
## 5               C:30602,Perl:5262,TeX:3999,Makefile:960      gpl-3.0
## 6                                Python:180067,TeX:7895          mit
##   repo_nbr_contribs repo_nbr_contribs_2ormore repo_info_obtained published.date
## 1                 2                         2         2020-09-26     2018-09-23
## 2                NA                        NA               <NA>     2017-04-19
## 3                 2                         1         2020-09-26     2020-08-26
## 4                11                         8         2020-09-26     2020-08-26
## 5                 2                         1         2020-09-26     2019-05-08
## 6                 7                         5         2020-09-26     2019-08-20
##   halfyear nbr_authors
## 1   2018H2           3
## 2   2017H1           1
## 3   2020H2           6
## 4   2020H2           2
## 5   2019H1           4
## 6   2019H2           5
## Serialize the full summary tibble so it can be downloaded and reused
saveRDS(papers, file = "joss_submission_analytics.rds")

To read the current version of this file directly from GitHub, use the following code:

## Read the current version of the serialized summary table directly
## from the gh-pages branch on GitHub
papers <- readRDS(gzcon(url("https://github.com/openjournals/joss-analytics/blob/gh-pages/joss_submission_analytics.rds?raw=true")))

Session info

sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] readr_1.3.1     citecorp_0.3.0  plotly_4.9.2.1  DT_0.15        
##  [5] jsonlite_1.7.1  purrr_0.3.4     gh_1.1.0        lubridate_1.7.9
##  [9] ggplot2_3.3.2   tidyr_1.1.2     dplyr_1.0.2     rcrossref_1.0.0
## [13] tibble_3.0.3   
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.4.2         viridisLite_0.3.0  splines_4.0.2      shiny_1.5.0       
##  [5] assertthat_0.2.1   triebeard_0.3.0    urltools_1.7.3     yaml_2.2.1        
##  [9] pillar_1.4.6       lattice_0.20-41    glue_1.4.2         digest_0.6.25     
## [13] RColorBrewer_1.1-2 promises_1.1.1     colorspace_1.4-1   Matrix_1.2-18     
## [17] htmltools_0.5.0    httpuv_1.5.4       plyr_1.8.6         pkgconfig_2.0.3   
## [21] bibtex_0.4.2.3     httpcode_0.3.0     xtable_1.8-4       scales_1.1.1      
## [25] whisker_0.4        later_1.1.0.1      mgcv_1.8-31        generics_0.0.2    
## [29] farver_2.0.3       ellipsis_0.3.1     withr_2.3.0        lazyeval_0.2.2    
## [33] cli_2.0.2          magrittr_1.5       crayon_1.3.4       mime_0.9          
## [37] evaluate_0.14      fansi_0.4.1        nlme_3.1-148       xml2_1.3.2        
## [41] tools_4.0.2        data.table_1.13.0  hms_0.5.3          lifecycle_0.2.0   
## [45] stringr_1.4.0      munsell_0.5.0      compiler_4.0.2     rlang_0.4.7       
## [49] grid_4.0.2         rstudioapi_0.11    htmlwidgets_1.5.1  crosstalk_1.1.0.1 
## [53] miniUI_0.1.1.1     labeling_0.3       rmarkdown_2.3      gtable_0.3.0      
## [57] curl_4.3           fauxpas_0.5.0      R6_2.4.1           knitr_1.30        
## [61] fastmap_1.0.1      utf8_1.1.4         stringi_1.5.3      crul_1.0.0        
## [65] Rcpp_1.0.5         vctrs_0.3.4        tidyselect_1.1.0   xfun_0.17